#Importing the necessary packages
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
# Load the three source tables from CSV files in the working directory.
lap_times = pd.read_csv('lap_times.csv')
races = pd.read_csv('races.csv')
pitstops_times = pd.read_csv('pit_stops.csv')
# Report the (rows, columns) shape of each table, one per line.
print(
    f'The dimension of lap_times is: {lap_times.shape}',
    f'The dimension of pitstops_times is: {pitstops_times.shape}',
    f'The dimension of races is: {races.shape}',
    sep='\n',
)
The dimension of lap_times is: (528785, 6) The dimension of pitstops_times is: (9299, 7) The dimension of races is: (1079, 18)
# List the column names of the lap and race tables, separated by the
# same blank lines the three individual print calls would produce.
print(
    f'The variables in lap_times include: {lap_times.columns.values}',
    '\n',
    f'The variables in races include: {races.columns.values}',
    sep='\n',
)
The variables in lap_times include: ['raceId' 'driverId' 'lap' 'position' 'time' 'milliseconds'] The variables in races include: ['raceId' 'year' 'round' 'circuitId' 'name' 'date' 'time' 'url' 'fp1_date' 'fp1_time' 'fp2_date' 'fp2_time' 'fp3_date' 'fp3_time' 'quali_date' 'quali_time' 'sprint_date' 'sprint_time']
# Attach the year and circuit of each race to every lap record.
# Only the three needed race columns are kept before joining on raceId.
small_races = races[['raceId', 'year', 'circuitId']]
laptimes_year = pd.merge(lap_times, small_races, on='raceId')
# Show the shape and columns of the joined table.
print(
    f'The dimension of the merged data frame is: {laptimes_year.shape}',
    '\n',
    f'The variables in merge include: {laptimes_year.columns.values}',
    sep='\n',
)
The dimension of the merged data frame is: (528785, 8) The variables in merge include: ['raceId' 'driverId' 'lap' 'position' 'time' 'milliseconds' 'year' 'circuitId']
# Keep only the races whose year is strictly greater than 2000.
check_races = small_races.query('year > 2000')
# Bare expression so the notebook displays the filtered table.
check_races
| raceId | year | circuitId | |
|---|---|---|---|
| 0 | 1 | 2009 | 1 |
| 1 | 2 | 2009 | 2 |
| 2 | 3 | 2009 | 17 |
| 3 | 4 | 2009 | 3 |
| 4 | 5 | 2009 | 4 |
| ... | ... | ... | ... |
| 1074 | 1092 | 2022 | 22 |
| 1075 | 1093 | 2022 | 69 |
| 1076 | 1094 | 2022 | 32 |
| 1077 | 1095 | 2022 | 18 |
| 1078 | 1096 | 2022 | 24 |
416 rows × 3 columns
Group the races by year and plot the graph. The graph shows the total number of races that took place in each year. There were fewer races in the earlier years, and the number of races started increasing gradually from 2010.
# Number of races held in each year.
# raceId is an identifier, so the races must be counted; summing the
# identifiers produces a meaningless value that grows with the id numbers.
YEAR_RACE = check_races.groupby('year')['raceId'].count()
YEAR_RACE.plot(kind='bar', title='Years vs Races', ylabel='NUMBER OF RACES',
               xlabel='YEARS', figsize=(6, 5))
<AxesSubplot:title={'center':'Years vs Races'}, xlabel='YEARS', ylabel='NUMBER OF RACES'>
The bar graph below depicts the total time, in milliseconds, of the races held each year. The total race time peaks roughly every seven years.
# Total lap time in milliseconds per year, for years after 2000.
# The values come from the lap records so they match the 'milliseconds'
# axis label; plotting a per-year aggregate of raceId would not.
yearly_milliseconds = (
    laptimes_year[laptimes_year['year'] > 2000]
    .groupby('year')['milliseconds']
    .sum()
)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
# Take x and y from the same grouped Series so every bar is paired with
# its own year. unique() returns years in order of appearance, which does
# not match the year-sorted order produced by groupby.
ax.bar(yearly_milliseconds.index, yearly_milliseconds.values)
# Show full numbers on the axes instead of scientific notation.
plt.ticklabel_format(style='plain')
plt.xlabel('years')
plt.ylabel('milliseconds')
plt.show()
# Split the lap records into two eras. Both bounds are exclusive, so the
# first era covers 1951-2009 and the second 2011-2019; laps from 2010
# fall into neither group.
before_2000 = laptimes_year.query('1950 < year < 2010')
after_2000 = laptimes_year.query('2010 < year < 2020')
# Mean lap time in milliseconds for each lap number, as plain lists
# ordered by ascending lap number.
X_lap_before_2000 = before_2000.groupby('lap')['milliseconds'].mean().tolist()
X_lap_after_2000 = after_2000.groupby('lap')['milliseconds'].mean().tolist()
# Show how many distinct lap numbers each era contains.
print(len(X_lap_before_2000), len(X_lap_after_2000))
78 78
The double line graph below depicts the average time, in milliseconds, taken for each lap number in two categories: 1950-2009 and 2011-2019. The lap times in 2011-2019 are higher than the 1950-2009 lap times. The maximum average lap time occurs at lap 25. The 1950-2009 period has a constant lap duration across all the laps. Across all the years, the lap time decreases as the lap count increases.
# Plot the average lap time per lap number for both eras.
# The x range is derived from the length of each list instead of a
# hard-coded 78, so the plot still works if the data changes.
# NOTE(review): this assumes lap numbers run 1..N without gaps - confirm.
plt.plot(range(1, len(X_lap_before_2000) + 1), X_lap_before_2000,
         label="1950-2009")  # label matches the year filter used for this era
plt.plot(range(1, len(X_lap_after_2000) + 1), X_lap_after_2000,
         label="2011-2019")
plt.xlabel('laps')
plt.ylabel('milliseconds')
plt.legend()
plt.show()
In the line graph below we can find the average lap time, in milliseconds, for each circuit ID in each year. It can be seen that no races were held in a few years. Each circuit ID maintains a nearly constant duration from year to year, with very minimal changes.
# Mean lap time in milliseconds for every (circuit, year) pair, kept as
# ordinary columns so plotly can address them by name.
circuitID_racetime = laptimes_year.groupby(
    ['circuitId', 'year'], as_index=False
)['milliseconds'].mean()
# One line per circuit, showing its mean lap time across the years.
fig = px.line(circuitID_racetime, x="year", y="milliseconds", color='circuitId')
fig.show()
# Attach the year and circuit of each race to every pit stop record.
pitstop_year = pd.merge(pitstops_times, small_races, on='raceId')
# Mean pit stop duration in milliseconds per year, as a plain list
# ordered by ascending year.
pitstop_time = pitstop_year.groupby('year')['milliseconds'].mean().tolist()
The line graph below shows the average pit stop time, in milliseconds, for each year. The pit stop time rises and falls, but each rise is about double the preceding fall.
# Plot the average pit stop time for each year.
# The years are sorted so they line up with pitstop_time, which groupby
# produced in ascending year order; unique() alone returns years in order
# of appearance.
plt.plot(sorted(pitstop_year['year'].unique()), pitstop_time)
# The x axis holds years, not laps.
plt.xlabel('years')
plt.ylabel('milliseconds')
# Show full numbers on the axes instead of scientific notation.
plt.ticklabel_format(style='plain')
plt.show()